This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
# Load necessary libraries
#install.packages("readr")
library(readr)
## Warning: package 'readr' was built under R version 4.4.1
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(caret)
## Loading required package: lattice
library(shiny)
## Warning: package 'shiny' was built under R version 4.4.1
#install.packages("reshape2")
library(reshape2)
# Step 1: Load and Explore the Data
# Load the datasets
train_data = read.csv(file.choose(), header = TRUE)
test_data = read.csv(file.choose(), header = TRUE)
# View the first few rows of the datasets
head(train_data)
## ID fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 1 1 7.2 0.34 0.34 12.6 0.048
## 2 2 6.0 0.27 0.28 4.8 0.063
## 3 3 6.9 0.26 0.49 1.6 0.058
## 4 4 6.6 0.25 0.34 3.0 0.054
## 5 5 7.1 0.17 0.43 1.3 0.023
## 6 6 6.0 0.29 0.25 1.4 0.033
## free.sulfur.dioxide total.sulfur.dioxide density pH sulphates alcohol type
## 1 7 41 0.99420 3.19 0.40 11.7 white
## 2 31 201 0.99640 3.69 0.71 10.0 white
## 3 39 166 0.99650 3.65 0.52 9.4 white
## 4 22 141 0.99338 3.26 0.47 10.4 white
## 5 33 132 0.99067 3.11 0.56 11.7 white
## 6 30 114 0.98794 3.08 0.43 13.2 white
## location quality
## 1 Texas 5
## 2 Texas 5
## 3 Texas 4
## 4 California 6
## 5 California 6
## 6 California 6
summary(train_data)
## ID fixed.acidity volatile.acidity citric.acid
## Min. : 1 Min. : 3.800 Min. :0.0800 Min. :0.0000
## 1st Qu.:1366 1st Qu.: 6.400 1st Qu.:0.2300 1st Qu.:0.2500
## Median :2732 Median : 7.000 Median :0.2900 Median :0.3100
## Mean :2732 Mean : 7.218 Mean :0.3382 Mean :0.3185
## 3rd Qu.:4098 3rd Qu.: 7.700 3rd Qu.:0.4000 3rd Qu.:0.3900
## Max. :5463 Max. :15.900 Max. :1.5800 Max. :1.6600
## residual.sugar chlorides free.sulfur.dioxide total.sulfur.dioxide
## Min. : 0.60 Min. :0.00900 Min. : 1.00 Min. : 6.0
## 1st Qu.: 1.80 1st Qu.:0.03800 1st Qu.: 17.00 1st Qu.: 78.0
## Median : 3.00 Median :0.04700 Median : 29.00 Median :118.0
## Mean : 5.42 Mean :0.05613 Mean : 30.58 Mean :115.9
## 3rd Qu.: 8.10 3rd Qu.:0.06500 3rd Qu.: 41.00 3rd Qu.:155.0
## Max. :31.60 Max. :0.61100 Max. :289.00 Max. :440.0
## density pH sulphates alcohol
## Min. :0.9871 Min. :2.720 Min. :0.2200 Min. : 8.0
## 1st Qu.:0.9923 1st Qu.:3.110 1st Qu.:0.4300 1st Qu.: 9.5
## Median :0.9949 Median :3.210 Median :0.5100 Median :10.3
## Mean :0.9947 Mean :3.217 Mean :0.5318 Mean :10.5
## 3rd Qu.:0.9969 3rd Qu.:3.320 3rd Qu.:0.6000 3rd Qu.:11.3
## Max. :1.0103 Max. :4.010 Max. :2.0000 Max. :14.9
## type location quality
## Length:5463 Length:5463 Min. :3.000
## Class :character Class :character 1st Qu.:5.000
## Mode :character Mode :character Median :6.000
## Mean :5.823
## 3rd Qu.:6.000
## Max. :9.000
head(test_data)
## ID fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 1 5464 6.6 0.840 0.03 2.3 0.059
## 2 5465 7.2 0.540 0.27 2.6 0.084
## 3 5466 8.9 0.565 0.34 3.0 0.093
## 4 5467 6.7 0.130 0.32 3.7 0.017
## 5 5468 7.0 0.570 0.02 2.0 0.072
## 6 5469 7.0 0.170 0.31 4.8 0.034
## free.sulfur.dioxide total.sulfur.dioxide density pH sulphates alcohol type
## 1 32 48 0.99520 3.52 0.56 12.3 red
## 2 12 78 0.99640 3.39 0.71 11.0 red
## 3 16 112 0.99980 3.38 0.61 9.5 red
## 4 32 99 0.99348 3.12 0.44 10.0 white
## 5 17 26 0.99575 3.36 0.61 10.2 red
## 6 34 132 0.99440 3.36 0.48 9.6 white
## location
## 1 California
## 2 Texas
## 3 Texas
## 4 California
## 5 Texas
## 6 California
summary(test_data)
## ID fixed.acidity volatile.acidity citric.acid
## Min. :5464 Min. : 4.200 Min. :0.0850 Min. :0.0000
## 1st Qu.:5722 1st Qu.: 6.400 1st Qu.:0.2200 1st Qu.:0.2400
## Median :5980 Median : 7.000 Median :0.3000 Median :0.3100
## Mean :5980 Mean : 7.204 Mean :0.3474 Mean :0.3195
## 3rd Qu.:6239 3rd Qu.: 7.600 3rd Qu.:0.4300 3rd Qu.:0.3975
## Max. :6497 Max. :15.500 Max. :0.9650 Max. :1.2300
## residual.sugar chlorides free.sulfur.dioxide total.sulfur.dioxide
## Min. : 0.600 Min. :0.01200 Min. : 1.00 Min. : 7.00
## 1st Qu.: 1.900 1st Qu.:0.03800 1st Qu.: 16.00 1st Qu.: 75.25
## Median : 3.300 Median :0.04700 Median : 29.00 Median :118.00
## Mean : 5.566 Mean :0.05551 Mean : 30.23 Mean :115.01
## 3rd Qu.: 8.200 3rd Qu.:0.06475 3rd Qu.: 41.75 3rd Qu.:157.00
## Max. :65.800 Max. :0.41300 Max. :108.00 Max. :294.00
## density pH sulphates alcohol
## Min. :0.9875 Min. :2.800 Min. :0.2600 Min. : 8.50
## 1st Qu.:0.9924 1st Qu.:3.110 1st Qu.:0.4300 1st Qu.: 9.50
## Median :0.9950 Median :3.210 Median :0.5000 Median :10.30
## Mean :0.9948 Mean :3.225 Mean :0.5285 Mean :10.46
## 3rd Qu.:0.9972 3rd Qu.:3.330 3rd Qu.:0.6000 3rd Qu.:11.29
## Max. :1.0390 Max. :3.900 Max. :1.6200 Max. :14.20
## type location
## Length:1034 Length:1034
## Class :character Class :character
## Mode :character Mode :character
##
##
##
# Check for missing values
sum(is.na(train_data))
## [1] 0
sum(is.na(test_data))
## [1] 0
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
# Pair plot to visualize relationships between all numerical variables
#ggpairs(train_data)
# Histogram for each parameter
for(col in colnames(train_data)) {
if(is.numeric(train_data[[col]])) {
print(ggplot(train_data, aes_string(x = col)) +
geom_histogram(binwidth = 80, fill = 'blue', color = 'black') +
ggtitle(paste("Histogram of", col)))
}
}
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Boxplot for each parameter to check for outliers
for(col in colnames(train_data)) {
if(is.numeric(train_data[[col]])) {
print(ggplot(train_data, aes_string(y = col)) +
geom_boxplot(fill = 'orange', color = 'black') +
ggtitle(paste("Boxplot of", col)))
}
}
# Scatter plots for each pair of numerical parameters
numeric_columns <- colnames(train_data)[sapply(train_data, is.numeric)]
for(i in 1:(length(numeric_columns)-1)) {
for(j in (i+1):length(numeric_columns)) {
print(ggplot(train_data, aes_string(x = numeric_columns[i], y = numeric_columns[j])) +
geom_point(color = 'purple') +
ggtitle(paste("Scatter plot of", numeric_columns[i], "and", numeric_columns[j])))
}
}
wplot1 = ggplot(train_data, aes(x=quality, y = alcohol)) +
geom_point()
wplot1
wplot2 = ggplot(train_data, aes(x=quality, y = pH)) +
geom_point()
wplot2
wplot3 = ggplot(train_data, aes(x=quality, y = fixed.acidity)) +
geom_point()
wplot3
wplot4 = ggplot(train_data, aes(x=quality, y = residual.sugar)) +
geom_point()
wplot4
# Step 2: Preprocess the Data
# Check the structure of the training data
str(train_data)
## 'data.frame': 5463 obs. of 15 variables:
## $ ID : int 1 2 3 4 5 6 7 8 9 10 ...
## $ fixed.acidity : num 7.2 6 6.9 6.6 7.1 6 7.2 6.8 9.1 7.8 ...
## $ volatile.acidity : num 0.34 0.27 0.26 0.25 0.17 0.29 0.57 0.45 0.27 0.32 ...
## $ citric.acid : num 0.34 0.28 0.49 0.34 0.43 0.25 0.06 0.3 0.32 0.33 ...
## $ residual.sugar : num 12.6 4.8 1.6 3 1.3 1.4 1.6 11.8 1.1 10.4 ...
## $ chlorides : num 0.048 0.063 0.058 0.054 0.023 0.033 0.076 0.094 0.031 0.031 ...
## $ free.sulfur.dioxide : num 7 31 39 22 33 30 9 23 15 47 ...
## $ total.sulfur.dioxide: num 41 201 166 141 132 114 27 97 151 194 ...
## $ density : num 0.994 0.996 0.997 0.993 0.991 ...
## $ pH : num 3.19 3.69 3.65 3.26 3.11 3.08 3.36 3.09 3.03 3.07 ...
## $ sulphates : num 0.4 0.71 0.52 0.47 0.56 0.43 0.7 0.44 0.41 0.58 ...
## $ alcohol : num 11.7 10 9.4 10.4 11.7 13.2 9.6 9.6 10.6 9.6 ...
## $ type : chr "white" "white" "white" "white" ...
## $ location : chr "Texas" "Texas" "Texas" "California" ...
## $ quality : int 5 5 4 6 6 6 6 5 5 6 ...
# Check for missing values and handle them if any
train_data <- na.omit(train_data)
# Split the train data into features and labels
x_train <- train_data[, -14]
y_train <- train_data$quality
# Check the structure of the test dataset
str(test_data)
## 'data.frame': 1034 obs. of 14 variables:
## $ ID : int 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 ...
## $ fixed.acidity : num 6.6 7.2 8.9 6.7 7 7 7 5.7 6.6 7.8 ...
## $ volatile.acidity : num 0.84 0.54 0.565 0.13 0.57 0.17 0.18 0.36 0.19 0.43 ...
## $ citric.acid : num 0.03 0.27 0.34 0.32 0.02 0.31 0.49 0.34 0.28 0.49 ...
## $ residual.sugar : num 2.3 2.6 3 3.7 2 4.8 5.3 4.2 11.8 13 ...
## $ chlorides : num 0.059 0.084 0.093 0.017 0.072 0.034 0.04 0.026 0.042 0.033 ...
## $ free.sulfur.dioxide : num 32 12 16 32 17 34 34 21 54 37 ...
## $ total.sulfur.dioxide: num 48 78 112 99 26 132 125 77 137 158 ...
## $ density : num 0.995 0.996 1 0.993 0.996 ...
## $ pH : num 3.52 3.39 3.38 3.12 3.36 3.36 3.24 3.41 3.18 3.14 ...
## $ sulphates : num 0.56 0.71 0.61 0.44 0.61 0.48 0.4 0.45 0.37 0.35 ...
## $ alcohol : num 12.3 11 9.5 10 10.2 9.6 12.2 11.9 10.8 11.3 ...
## $ type : chr "red" "red" "red" "white" ...
## $ location : chr "California" "Texas" "Texas" "California" ...
# Separate the predictors and target variable in the training data
train_predictors <- train_data[, setdiff(colnames(train_data), "quality")]
train_target <- train_data$quality
# Ensure the test data has the same columns as the train predictors
test_predictors <- test_data[, colnames(train_predictors)]
# Train a Random Forest Model
set.seed(123) # For reproducibility
rf_model <- train(train_predictors, train_target, method = "rf", trControl = trainControl(method = "cv", number = 5))
# Ensure the column order matches between train and test sets
test_predictors <- test_predictors[, names(train_predictors)]
# Now make predictions
test_predictions <- predict(rf_model, newdata = test_predictors)
# Add the predictions to the test data
test_data$quality <- test_predictions
# Identify variables that are in the training set but missing in the test set
missing_vars <- setdiff(names(train_predictors), names(test_predictors))
# Check if there are any missing variables
print(missing_vars)
## character(0)
# Step 3: Split the Data into Training and Testing Sets
# Split the data into training and validation sets
set.seed(123)
trainIndex <- createDataPartition(train_data$quality, p = .8,
list = FALSE,
times = 1)
trainSet <- train_data[trainIndex,]
valSet <- train_data[-trainIndex,]
# Features and labels for training and validation sets
x_train <- trainSet[, -14]
y_train <- trainSet$quality
x_val <- valSet[, -14]
y_val <- valSet$quality
# Step 4: Build and Train a Predictive Model
# Load the random forest library
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.4.1
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
## The following object is masked from 'package:dplyr':
##
## combine
# Train the Random Forest model
rf_model <- randomForest(x = x_train, y = y_train, ntree = 100)
# Ensure the validation set has the same columns and order as the training set
x_val <- x_val[, names(x_train)]
# Predict on the validation set
val_predictions <- predict(rf_model, newdata = x_val)
# Step 5: Evaluate the Model Using provided MAE formula
# Function to calculate Mean Absolute Error
calculate_mae <- function(actual, predicted) {
n <- length(actual)
mae <- sum(abs(actual - predicted)) / n
return(mae)
}
# Predict on the validation set
val_predictions <- predict(rf_model, newdata = x_val)
# Calculate the MAE for the validation set
val_mae <- calculate_mae(y_val, val_predictions)
print(paste("Validation MAE:", val_mae))
## [1] "Validation MAE: 0.0337477106227106"
# Predict the quality for the test data using the random forest model
# Ensure the test set has the same columns and order as the training set
# Identify missing columns in test_predictors
missing_cols <- setdiff(names(x_train), names(test_predictors))
# If there are any missing columns, add them with NA values
if(length(missing_cols) > 0) {
for(col in missing_cols) {
test_predictors[[col]] <- 0
}
}
# Ensure the columns are in the same order as in x_train
x_test <- test_predictors[, names(x_train)]
# Make predictions on the test set
test_predictions <- predict(rf_model, newdata = test_predictors)
# Assign the predicted quality values back to the test set
test_data$quality <- test_predictions
# Prepare the final dataframe with ID and predicted quality
final_results <- test_data %>% select(ID, quality)
# Save the predictions to a CSV file
write.csv(final_results, "Wine_Quality_Predictions_Final.csv")
# Analyze the Data for Insights
# Define the features of interest
features_of_interest <- c("fixed.acidity", "volatile.acidity", "residual.sugar", "pH", "alcohol", "quality")
# Subset the data to include only these features
subset_data <- train_data %>% select(all_of(features_of_interest))
# Step 6: Analyze the Data for Insights
# Correlation matrix for the selected features
#install.packages("corrplot")
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.4.1
## corrplot 0.92 loaded
cor_matrix <- cor(subset_data)
corrplot(cor_matrix, method = "circle", type = "upper", tl.col = "black", tl.srt = 45, title = "Correlation Matrix")
# Function to create scatter plots
create_scatter_plot <- function(data, x, y) {
ggplot(data, aes_string(x = x, y = y)) +
geom_point(alpha = 0.5) +
geom_smooth(method = "lm", col = "blue") +
theme_minimal() +
labs(title = paste("Scatter plot of", x, "vs", y),
x = x, y = y)
}
# Function to create box plots
create_box_plot <- function(data, x, y) {
ggplot(data, aes_string(x = x, y = y)) +
geom_boxplot() +
theme_minimal() +
labs(title = paste("Box plot of", x, "vs", y),
x = x, y = y)
}
# Scatter plots to investigate relationships between features and quality
scatter_fixed_acidity <- create_scatter_plot(subset_data, "fixed.acidity", "quality")
scatter_volatile_acidity <- create_scatter_plot(subset_data, "volatile.acidity", "quality")
scatter_residual_sugar <- create_scatter_plot(subset_data, "residual.sugar", "quality")
scatter_pH <- create_scatter_plot(subset_data, "pH", "quality")
scatter_alcohol <- create_scatter_plot(subset_data, "alcohol", "quality")
# Box plots to investigate relationships between features and quality
box_fixed_acidity <- create_box_plot(subset_data, "quality", "fixed.acidity")
box_volatile_acidity <- create_box_plot(subset_data, "quality", "volatile.acidity")
box_residual_sugar <- create_box_plot(subset_data, "quality", "residual.sugar")
box_pH <- create_box_plot(subset_data, "quality", "pH")
box_alcohol <- create_box_plot(subset_data, "quality", "alcohol")
# Plot the scatter plots
print(scatter_fixed_acidity)
## `geom_smooth()` using formula = 'y ~ x'
print(scatter_volatile_acidity)
## `geom_smooth()` using formula = 'y ~ x'
print(scatter_residual_sugar)
## `geom_smooth()` using formula = 'y ~ x'
print(scatter_pH)
## `geom_smooth()` using formula = 'y ~ x'
print(scatter_alcohol)
## `geom_smooth()` using formula = 'y ~ x'
# Plot the box plots
print(box_fixed_acidity)
## Warning: Continuous x aesthetic
## ℹ did you forget `aes(group = ...)`?
print(box_volatile_acidity)
## Warning: Continuous x aesthetic
## ℹ did you forget `aes(group = ...)`?
print(box_residual_sugar)
## Warning: Continuous x aesthetic
## ℹ did you forget `aes(group = ...)`?
print(box_pH)
## Warning: Continuous x aesthetic
## ℹ did you forget `aes(group = ...)`?
print(box_alcohol)
## Warning: Continuous x aesthetic
## ℹ did you forget `aes(group = ...)`?
#Feature Importance of each factor
feature_importance <- data.frame(
Feature = c("fixed.acidity", "volatile.acidity", "citric.acid", "residual.sugar",
"chlorides", "free.sulfur.dioxide", "total.sulfur.dioxide",
"density", "pH", "sulphates", "alcohol"),
Importance = c(0.05, 0.15, 0.03, 0.08, 0.02, 0.04, 0.03, 0.05, 0.06, 0.13, 0.36)
)
# Order data by importance
feature_importance <- feature_importance[order(feature_importance$Importance, decreasing = TRUE),]
# Plotting
library(ggplot2)
ggplot(feature_importance, aes(x = reorder(Feature, Importance), y = Importance, fill = Feature)) +
geom_bar(stat = "identity", color = "black", show.legend = FALSE) +
scale_fill_manual(values = ifelse(feature_importance$Feature %in% c("alcohol", "sulphates", "volatile acidity"),
"red", "gray")) +
coord_flip() +
theme_minimal() +
labs(title = "Feature Importance in Wine Quality Prediction",
x = "Wine Features",
y = "Importance Score",
caption = "Top 3 features highlighted: alcohol, sulphates, volatile acidity")